In [7]:
# Render our plots inline, i.e. as part of the notebook's cell output
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np # we'll need this for sqrt and mean

# Make the graphs a bit prettier. The pandas option 'display.mpl_style' that was
# set here originally has been removed from pandas, so setting it raises an
# OptionError on current versions; matplotlib's own style sheets replace it
# ('ggplot' is the closest built-in look).
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (15, 5)  # default figure size: (width, height) in inches

Video 1


In [8]:
# Load the training data; the path is relative to the notebook's working directory.
NBA = pd.read_csv(filepath_or_buffer="NBA_train.csv")

Simply printing out the DataFrame will give us similar information to R's str():


In [38]:
# A bare expression on the last line of a cell is displayed by the notebook; the
# recorded output below shows only the first and last 30 rows of the frame.
# NOTE(review): the recorded output (execution count 38) already contains PTSdiff,
# X2PA and X3PA, which are only created in later cells, so it was captured after
# those cells ran. On a fresh kernel this cell shows the raw columns only.
NBA


Out[38]:
SeasonEnd Team Playoffs W PTS oppPTS FG FGA 2P 2PA ... FTA ORB DRB AST STL BLK TOV PTSdiff X2PA X3PA
0 1980 Atlanta Hawks 1 50 8573 8334 3261 7027 3248 6952 ... 2645 1369 2406 1913 782 539 1495 239 6952 75
1 1980 Boston Celtics 1 61 9303 8664 3617 7387 3455 6965 ... 2449 1227 2457 2198 809 308 1539 639 6965 422
2 1980 Chicago Bulls 0 30 8813 9035 3362 6943 3292 6668 ... 2592 1115 2465 2152 704 392 1684 -222 6668 275
3 1980 Cleveland Cavaliers 0 37 9360 9332 3811 8041 3775 7854 ... 2205 1307 2381 2108 764 342 1370 28 7854 187
4 1980 Denver Nuggets 0 30 8878 9240 3462 7470 3379 7215 ... 2539 1311 2524 2079 746 404 1533 -362 7215 255
5 1980 Detroit Pistons 0 16 8933 9609 3643 7596 3586 7377 ... 2149 1226 2415 1950 783 562 1742 -676 7377 219
6 1980 Golden State Warriors 0 24 8493 8853 3527 7318 3500 7197 ... 1914 1155 2437 2028 779 339 1492 -360 7197 121
7 1980 Houston Rockets 1 41 9084 9070 3599 7496 3495 7117 ... 2326 1394 2217 2149 782 373 1565 14 7117 379
8 1980 Indiana Pacers 0 37 9119 9176 3639 7689 3551 7375 ... 2333 1398 2326 2148 900 530 1517 -57 7375 314
9 1980 Kansas City Kings 1 47 8860 8603 3582 7489 3557 7375 ... 2250 1187 2429 2123 863 356 1439 257 7375 114
10 1980 Los Angeles Lakers 1 60 9438 8954 3898 7368 3878 7268 ... 2092 1085 2653 2413 774 546 1639 484 7268 100
11 1980 Milwaukee Bucks 1 49 9025 8702 3685 7553 3635 7398 ... 2102 1245 2396 2277 778 510 1496 323 7398 155
12 1980 New Jersey Nets 0 34 8879 8975 3456 7504 3371 7206 ... 2406 1229 2535 2094 869 581 1702 -96 7206 298
13 1980 New York Knicks 0 39 9344 9438 3802 7672 3760 7481 ... 2274 1236 2303 2265 881 457 1613 -94 7481 191
14 1980 Philadelphia 76ers 1 59 8949 8603 3523 7156 3496 7031 ... 2431 1187 2635 2226 792 652 1708 346 7031 125
15 1980 Phoenix Suns 1 55 9114 8819 3570 7235 3502 6955 ... 2466 1071 2458 2283 908 344 1629 295 6955 280
16 1980 Portland Trail Blazers 1 38 8402 8469 3408 7167 3382 7035 ... 2100 1295 2408 1898 708 472 1552 -67 7035 132
17 1980 San Antonio Spurs 1 41 9788 9819 3856 7738 3804 7532 ... 2528 1153 2515 2326 771 333 1589 -31 7532 206
18 1980 San Diego Clippers 0 35 8820 9160 3524 7494 3347 6951 ... 2167 1294 2308 1688 664 288 1443 -340 6951 543
19 1980 Seattle SuperSonics 1 56 8897 8515 3554 7565 3495 7376 ... 2253 1380 2550 2043 750 428 1496 382 7376 189
20 1980 Utah Jazz 0 24 8394 8887 3382 6817 3323 6632 ... 1943 967 2359 2005 656 362 1543 -493 6632 185
21 1980 Washington Bullets 1 39 8773 8982 3574 7796 3501 7558 ... 2048 1334 2723 2201 530 443 1380 -209 7558 238
22 1981 Atlanta Hawks 0 31 8604 8858 3291 6866 3281 6784 ... 2590 1201 2224 1846 749 469 1605 -254 6784 82
23 1981 Boston Celtics 1 62 9008 8526 3581 7099 3516 6858 ... 2369 1155 2424 2202 683 594 1577 482 6858 241
24 1981 Chicago Bulls 1 45 8937 8775 3457 6903 3419 6724 ... 2563 1227 2475 1925 729 514 1672 162 6724 179
25 1981 Cleveland Cavaliers 0 28 8670 9068 3556 7609 3484 7360 ... 1909 1258 2243 2007 632 322 1396 -398 7360 249
26 1981 Dallas Mavericks 0 15 8322 9011 3204 6928 3158 6763 ... 2487 1109 2177 1984 561 214 1439 -689 6763 165
27 1981 Denver Nuggets 0 37 9986 10025 3784 7960 3754 7815 ... 3051 1325 2497 2030 720 380 1444 -39 7815 145
28 1981 Detroit Pistons 0 21 8174 8692 3236 6986 3223 6902 ... 2330 1201 2111 1819 884 492 1759 -518 6902 84
29 1981 Golden State Warriors 0 39 9006 9103 3560 7284 3500 7074 ... 2513 1403 2366 2026 611 301 1547 -97 7074 210
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
805 2011 Atlanta Hawks 1 44 7790 7857 2971 6429 2469 5002 ... 1728 762 2460 1802 497 341 1118 -67 5002 1427
806 2011 Boston Celtics 1 56 7913 7473 3023 6219 2615 5100 ... 1895 639 2542 1921 676 344 1195 440 5100 1119
807 2011 Charlotte Bobcats 0 34 7650 7978 2873 6365 2480 5162 ... 1999 848 2444 1728 524 433 1192 -328 5162 1203
808 2011 Chicago Bulls 1 62 8087 7487 3042 6587 2531 5172 ... 2008 967 2654 1827 592 468 1161 600 5172 1415
809 2011 Cleveland Cavaliers 0 19 7827 8566 2886 6647 2377 5158 ... 2075 856 2449 1720 544 341 1166 -739 5158 1489
810 2011 Dallas Mavericks 1 57 8220 7873 3069 6463 2424 4695 ... 1850 780 2618 1954 557 352 1145 347 4695 1768
811 2011 Denver Nuggets 1 50 8811 8421 3145 6613 2483 4909 ... 2429 791 2652 1813 605 352 1157 390 4909 1704
812 2011 Detroit Pistons 0 30 7951 8246 3056 6647 2584 5391 ... 1854 931 2236 1730 595 328 1067 -295 5391 1256
813 2011 Golden State Warriors 0 36 8477 8668 3251 7047 2566 5298 ... 1695 955 2370 1847 737 406 1198 -191 5298 1749
814 2011 Houston Rockets 0 43 8685 8506 3170 6975 2493 5132 ... 2083 962 2549 1955 581 371 1110 179 5132 1843
815 2011 Indiana Pacers 1 37 8183 8271 3003 6787 2418 5134 ... 2035 914 2657 1611 584 456 1262 -88 5134 1653
816 2011 Los Angeles Clippers 0 32 8089 8346 3015 6594 2502 5075 ... 2187 955 2501 1813 585 402 1343 -257 5075 1519
817 2011 Los Angeles Lakers 1 57 8321 7820 3128 6757 2604 5270 ... 1979 989 2616 1801 602 422 1073 501 5270 1487
818 2011 Memphis Grizzlies 1 46 8195 8003 3200 6801 2891 5875 ... 1981 970 2391 1691 771 441 1145 192 5875 926
819 2011 Miami Heat 1 58 8369 7757 3031 6301 2484 4822 ... 2288 790 2666 1639 544 430 1142 612 4822 1479
820 2011 Milwaukee Bucks 0 35 7534 7603 2814 6544 2331 5130 ... 1881 862 2480 1545 617 399 1103 -69 5130 1414
821 2011 Minnesota Timberwolves 0 17 8288 8832 3090 7014 2501 5449 ... 1977 1085 2556 1650 592 422 1398 -544 5449 1565
822 2011 New Jersey Nets 0 24 7722 8234 2918 6638 2459 5301 ... 1881 909 2440 1723 458 384 1152 -512 5301 1337
823 2011 New Orleans Hornets 1 46 7784 7711 2944 6416 2500 5184 ... 1897 824 2468 1691 624 359 1069 73 5184 1232
824 2011 New York Knicks 1 42 8734 8670 3140 6867 2375 4786 ... 2087 847 2470 1757 625 475 1123 64 4786 2081
825 2011 Oklahoma City Thunder 1 55 8596 8285 3066 6609 2579 5206 ... 2401 903 2604 1672 654 487 1156 311 5206 1403
826 2011 Orlando Magic 1 52 8135 7687 2956 6411 2186 4308 ... 2101 864 2679 1636 548 384 1224 448 4308 2103
827 2011 Philadelphia 76ers 1 41 8119 7996 3125 6776 2682 5528 ... 1851 850 2578 1861 621 355 1063 123 5528 1248
828 2011 Phoenix Suns 0 40 8611 8684 3219 6844 2518 4987 ... 1939 821 2478 1945 545 357 1169 -73 4987 1857
829 2011 Portland Trail Blazers 1 48 7896 7771 2951 6599 2433 5096 ... 1835 996 2230 1736 660 358 1070 125 5096 1503
830 2011 Sacramento Kings 0 24 8151 8589 3134 6979 2706 5702 ... 1981 1071 2526 1675 608 391 1324 -438 5702 1277
831 2011 San Antonio Spurs 1 61 8502 8034 3148 6628 2463 4901 ... 1984 829 2603 1836 602 372 1101 468 4901 1727
832 2011 Toronto Raptors 0 22 8124 8639 3144 6755 2799 5664 ... 1976 963 2343 1795 581 350 1206 -515 5664 1091
833 2011 Utah Jazz 0 39 8153 8303 3064 6590 2629 5334 ... 2061 898 2338 1921 629 484 1175 -150 5334 1256
834 2011 Washington Wizards 0 23 7977 8584 3048 6888 2656 5706 ... 1999 1013 2374 1592 665 502 1258 -607 5706 1182

835 rows × 23 columns

And for the summary we'll use an equivalent method, DataFrame.describe():


In [37]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
# NOTE(review): the recorded output (execution count 37) includes PTSdiff, X2PA and
# X3PA, which are only created in later cells; re-run from a fresh kernel to
# describe the raw columns only.
NBA.describe()


Out[37]:
SeasonEnd Playoffs W PTS oppPTS FG FGA 2P 2PA 3P ... FTA ORB DRB AST STL BLK TOV PTSdiff X2PA X3PA
count 835.000000 835.000000 835.000000 835.000000 835.000000 835.000000 835.000000 835.000000 835.000000 835.000000 ... 835.000000 835.000000 835.000000 835.000000 835.000000 835.000000 835.000000 835.000000 835.000000 835.000000
mean 1996.319760 0.574850 41.000000 8370.239521 8370.239521 3200.367665 6873.318563 2881.324551 5956.444311 319.043114 ... 2189.953293 1061.584431 2427.354491 1912.112575 668.364072 419.805988 1302.837126 0.000000 5956.444311 916.874251
std 9.243808 0.494662 12.740822 581.040114 587.543959 287.181266 401.027166 446.097941 830.596327 199.698941 ... 244.491086 150.224519 130.671523 221.610925 93.393044 82.274913 153.973470 379.547673 830.596327 523.982964
min 1980.000000 0.000000 11.000000 6901.000000 6909.000000 2565.000000 5972.000000 1981.000000 4153.000000 10.000000 ... 1475.000000 639.000000 2044.000000 1423.000000 455.000000 204.000000 931.000000 -1246.000000 4153.000000 75.000000
25% 1989.000000 0.000000 31.000000 7934.000000 7934.000000 2974.000000 6563.500000 2510.000000 5269.000000 131.500000 ... 2008.000000 953.500000 2346.500000 1735.000000 599.000000 359.000000 1192.000000 -268.000000 5269.000000 413.000000
50% 1996.000000 1.000000 42.000000 8312.000000 8365.000000 3150.000000 6831.000000 2718.000000 5706.000000 329.000000 ... 2176.000000 1055.000000 2433.000000 1899.000000 658.000000 410.000000 1289.000000 21.000000 5706.000000 942.000000
75% 2005.000000 1.000000 50.500000 8784.500000 8768.500000 3434.500000 7157.000000 3296.000000 6753.500000 481.500000 ... 2352.000000 1167.000000 2516.500000 2077.500000 729.000000 469.500000 1395.500000 287.500000 6753.500000 1347.500000
max 2011.000000 1.000000 72.000000 10371.000000 10723.000000 3980.000000 8868.000000 3954.000000 7873.000000 841.000000 ... 3051.000000 1520.000000 2753.000000 2575.000000 1053.000000 716.000000 1873.000000 1004.000000 7873.000000 2284.000000

8 rows × 22 columns

Video 2

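Now we want to create a table of Playoffs by Wins. In R, the command is table(NBA$W, NBA$Playoffs). The pandas cell below groups by W and sums Playoffs, so for each number of wins it shows how many teams made the playoffs (the Playoffs = 1 column of R's table); pd.crosstab(NBA['W'], NBA['Playoffs']) would give the full two-way table.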


In [10]:
# For every win total W, count the teams that made the playoffs (Playoffs is 0/1,
# so the sum is the number of playoff teams). The groupby's own .sum() is used
# because passing Python's builtin `sum` to aggregate() is deprecated in recent
# pandas and emits a FutureWarning; the resulting table is the same.
NBA.groupby('W')[['Playoffs']].sum()


Out[10]:
Playoffs
W
11 0
12 0
13 0
14 0
15 0
16 0
17 0
18 0
19 0
20 0
21 0
22 0
23 0
24 0
25 0
26 0
27 0
28 0
29 0
30 1
31 1
32 0
33 0
34 0
35 3
36 4
37 4
38 7
39 10
40 13
41 26
42 29
43 18
44 27
45 22
46 15
47 28
48 14
49 17
50 32
51 12
52 20
53 17
54 18
55 24
56 16
57 23
58 13
59 14
60 8
61 10
62 13
63 7
64 3
65 3
66 2
67 4
69 1
72 1

Create a new column in NBA that contains the difference between points scored and points allowed. (R: NBA$PTSdiff = NBA$PTS - NBA$oppPTS)


In [11]:
# Point differential per team-season: points scored minus points scored by opponents.
NBA['PTSdiff'] = NBA['PTS'].sub(NBA['oppPTS'])

In [12]:
# Display the frame again to check the new PTSdiff column (last column in the output).
NBA


Out[12]:
SeasonEnd Team Playoffs W PTS oppPTS FG FGA 2P 2PA ... 3PA FT FTA ORB DRB AST STL BLK TOV PTSdiff
0 1980 Atlanta Hawks 1 50 8573 8334 3261 7027 3248 6952 ... 75 2038 2645 1369 2406 1913 782 539 1495 239
1 1980 Boston Celtics 1 61 9303 8664 3617 7387 3455 6965 ... 422 1907 2449 1227 2457 2198 809 308 1539 639
2 1980 Chicago Bulls 0 30 8813 9035 3362 6943 3292 6668 ... 275 2019 2592 1115 2465 2152 704 392 1684 -222
3 1980 Cleveland Cavaliers 0 37 9360 9332 3811 8041 3775 7854 ... 187 1702 2205 1307 2381 2108 764 342 1370 28
4 1980 Denver Nuggets 0 30 8878 9240 3462 7470 3379 7215 ... 255 1871 2539 1311 2524 2079 746 404 1533 -362
5 1980 Detroit Pistons 0 16 8933 9609 3643 7596 3586 7377 ... 219 1590 2149 1226 2415 1950 783 562 1742 -676
6 1980 Golden State Warriors 0 24 8493 8853 3527 7318 3500 7197 ... 121 1412 1914 1155 2437 2028 779 339 1492 -360
7 1980 Houston Rockets 1 41 9084 9070 3599 7496 3495 7117 ... 379 1782 2326 1394 2217 2149 782 373 1565 14
8 1980 Indiana Pacers 0 37 9119 9176 3639 7689 3551 7375 ... 314 1753 2333 1398 2326 2148 900 530 1517 -57
9 1980 Kansas City Kings 1 47 8860 8603 3582 7489 3557 7375 ... 114 1671 2250 1187 2429 2123 863 356 1439 257
10 1980 Los Angeles Lakers 1 60 9438 8954 3898 7368 3878 7268 ... 100 1622 2092 1085 2653 2413 774 546 1639 484
11 1980 Milwaukee Bucks 1 49 9025 8702 3685 7553 3635 7398 ... 155 1605 2102 1245 2396 2277 778 510 1496 323
12 1980 New Jersey Nets 0 34 8879 8975 3456 7504 3371 7206 ... 298 1882 2406 1229 2535 2094 869 581 1702 -96
13 1980 New York Knicks 0 39 9344 9438 3802 7672 3760 7481 ... 191 1698 2274 1236 2303 2265 881 457 1613 -94
14 1980 Philadelphia 76ers 1 59 8949 8603 3523 7156 3496 7031 ... 125 1876 2431 1187 2635 2226 792 652 1708 346
15 1980 Phoenix Suns 1 55 9114 8819 3570 7235 3502 6955 ... 280 1906 2466 1071 2458 2283 908 344 1629 295
16 1980 Portland Trail Blazers 1 38 8402 8469 3408 7167 3382 7035 ... 132 1560 2100 1295 2408 1898 708 472 1552 -67
17 1980 San Antonio Spurs 1 41 9788 9819 3856 7738 3804 7532 ... 206 2024 2528 1153 2515 2326 771 333 1589 -31
18 1980 San Diego Clippers 0 35 8820 9160 3524 7494 3347 6951 ... 543 1595 2167 1294 2308 1688 664 288 1443 -340
19 1980 Seattle SuperSonics 1 56 8897 8515 3554 7565 3495 7376 ... 189 1730 2253 1380 2550 2043 750 428 1496 382
20 1980 Utah Jazz 0 24 8394 8887 3382 6817 3323 6632 ... 185 1571 1943 967 2359 2005 656 362 1543 -493
21 1980 Washington Bullets 1 39 8773 8982 3574 7796 3501 7558 ... 238 1552 2048 1334 2723 2201 530 443 1380 -209
22 1981 Atlanta Hawks 0 31 8604 8858 3291 6866 3281 6784 ... 82 2012 2590 1201 2224 1846 749 469 1605 -254
23 1981 Boston Celtics 1 62 9008 8526 3581 7099 3516 6858 ... 241 1781 2369 1155 2424 2202 683 594 1577 482
24 1981 Chicago Bulls 1 45 8937 8775 3457 6903 3419 6724 ... 179 1985 2563 1227 2475 1925 729 514 1672 162
25 1981 Cleveland Cavaliers 0 28 8670 9068 3556 7609 3484 7360 ... 249 1486 1909 1258 2243 2007 632 322 1396 -398
26 1981 Dallas Mavericks 0 15 8322 9011 3204 6928 3158 6763 ... 165 1868 2487 1109 2177 1984 561 214 1439 -689
27 1981 Denver Nuggets 0 37 9986 10025 3784 7960 3754 7815 ... 145 2388 3051 1325 2497 2030 720 380 1444 -39
28 1981 Detroit Pistons 0 21 8174 8692 3236 6986 3223 6902 ... 84 1689 2330 1201 2111 1819 884 492 1759 -518
29 1981 Golden State Warriors 0 39 9006 9103 3560 7284 3500 7074 ... 210 1826 2513 1403 2366 2026 611 301 1547 -97
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
805 2011 Atlanta Hawks 1 44 7790 7857 2971 6429 2469 5002 ... 1427 1346 1728 762 2460 1802 497 341 1118 -67
806 2011 Boston Celtics 1 56 7913 7473 3023 6219 2615 5100 ... 1119 1459 1895 639 2542 1921 676 344 1195 440
807 2011 Charlotte Bobcats 0 34 7650 7978 2873 6365 2480 5162 ... 1203 1511 1999 848 2444 1728 524 433 1192 -328
808 2011 Chicago Bulls 1 62 8087 7487 3042 6587 2531 5172 ... 1415 1492 2008 967 2654 1827 592 468 1161 600
809 2011 Cleveland Cavaliers 0 19 7827 8566 2886 6647 2377 5158 ... 1489 1546 2075 856 2449 1720 544 341 1166 -739
810 2011 Dallas Mavericks 1 57 8220 7873 3069 6463 2424 4695 ... 1768 1437 1850 780 2618 1954 557 352 1145 347
811 2011 Denver Nuggets 1 50 8811 8421 3145 6613 2483 4909 ... 1704 1859 2429 791 2652 1813 605 352 1157 390
812 2011 Detroit Pistons 0 30 7951 8246 3056 6647 2584 5391 ... 1256 1367 1854 931 2236 1730 595 328 1067 -295
813 2011 Golden State Warriors 0 36 8477 8668 3251 7047 2566 5298 ... 1749 1290 1695 955 2370 1847 737 406 1198 -191
814 2011 Houston Rockets 0 43 8685 8506 3170 6975 2493 5132 ... 1843 1668 2083 962 2549 1955 581 371 1110 179
815 2011 Indiana Pacers 1 37 8183 8271 3003 6787 2418 5134 ... 1653 1592 2035 914 2657 1611 584 456 1262 -88
816 2011 Los Angeles Clippers 0 32 8089 8346 3015 6594 2502 5075 ... 1519 1546 2187 955 2501 1813 585 402 1343 -257
817 2011 Los Angeles Lakers 1 57 8321 7820 3128 6757 2604 5270 ... 1487 1541 1979 989 2616 1801 602 422 1073 501
818 2011 Memphis Grizzlies 1 46 8195 8003 3200 6801 2891 5875 ... 926 1486 1981 970 2391 1691 771 441 1145 192
819 2011 Miami Heat 1 58 8369 7757 3031 6301 2484 4822 ... 1479 1760 2288 790 2666 1639 544 430 1142 612
820 2011 Milwaukee Bucks 0 35 7534 7603 2814 6544 2331 5130 ... 1414 1423 1881 862 2480 1545 617 399 1103 -69
821 2011 Minnesota Timberwolves 0 17 8288 8832 3090 7014 2501 5449 ... 1565 1519 1977 1085 2556 1650 592 422 1398 -544
822 2011 New Jersey Nets 0 24 7722 8234 2918 6638 2459 5301 ... 1337 1427 1881 909 2440 1723 458 384 1152 -512
823 2011 New Orleans Hornets 1 46 7784 7711 2944 6416 2500 5184 ... 1232 1452 1897 824 2468 1691 624 359 1069 73
824 2011 New York Knicks 1 42 8734 8670 3140 6867 2375 4786 ... 2081 1689 2087 847 2470 1757 625 475 1123 64
825 2011 Oklahoma City Thunder 1 55 8596 8285 3066 6609 2579 5206 ... 1403 1977 2401 903 2604 1672 654 487 1156 311
826 2011 Orlando Magic 1 52 8135 7687 2956 6411 2186 4308 ... 2103 1453 2101 864 2679 1636 548 384 1224 448
827 2011 Philadelphia 76ers 1 41 8119 7996 3125 6776 2682 5528 ... 1248 1426 1851 850 2578 1861 621 355 1063 123
828 2011 Phoenix Suns 0 40 8611 8684 3219 6844 2518 4987 ... 1857 1472 1939 821 2478 1945 545 357 1169 -73
829 2011 Portland Trail Blazers 1 48 7896 7771 2951 6599 2433 5096 ... 1503 1476 1835 996 2230 1736 660 358 1070 125
830 2011 Sacramento Kings 0 24 8151 8589 3134 6979 2706 5702 ... 1277 1455 1981 1071 2526 1675 608 391 1324 -438
831 2011 San Antonio Spurs 1 61 8502 8034 3148 6628 2463 4901 ... 1727 1521 1984 829 2603 1836 602 372 1101 468
832 2011 Toronto Raptors 0 22 8124 8639 3144 6755 2799 5664 ... 1091 1491 1976 963 2343 1795 581 350 1206 -515
833 2011 Utah Jazz 0 39 8153 8303 3064 6590 2629 5334 ... 1256 1590 2061 898 2338 1921 629 484 1175 -150
834 2011 Washington Wizards 0 23 7977 8584 3048 6888 2656 5706 ... 1182 1489 1999 1013 2374 1592 665 502 1258 -607

835 rows × 21 columns

Now we plot Wins as a function of point difference to get an idea of whether there is a correlation.


In [13]:
# Scatter plot: point differential on the x-axis, number of wins on the y-axis.
NBA.plot(
    kind='scatter',
    x='PTSdiff',
    y='W',
)


Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fae1c16e160>

To build a linear regression model, I followed this blog post. We need to import LinearRegression from scikit-learn:


In [14]:
from sklearn.linear_model import LinearRegression

And build the model:


In [15]:
# Simple linear regression of wins on point differential.
# Both arguments are passed as one-column DataFrames (double brackets).
lr = LinearRegression()
lr.fit(X=NBA[['PTSdiff']], y=NBA[['W']])


Out[15]:
LinearRegression(copy_X=True, fit_intercept=True, normalize=False)

I didn't find a method that would print a nice R-like summary of the model in Python, so we need to print the values manually:


In [16]:
# Print R^2 on the training data, then the fitted intercept and slope, one per line.
print(
    lr.score(NBA[['PTSdiff']], NBA[['W']]),    # R^2
    lr.intercept_,
    lr.coef_,
    sep='\n',
)


0.94234248197
[ 41.]
[[ 0.03258633]]

I've just found a method in pandas that prints out a summary!


In [17]:
from pandas.stats.api import ols
model = ols(x=NBA['PTSdiff'], y=NBA['W'])

In [18]:
model


Out[18]:
-------------------------Summary of Regression Analysis-------------------------

Formula: Y ~ <x> + <intercept>

Number of Observations:         835
Number of Degrees of Freedom:   2

R-squared:         0.9423
Adj R-squared:     0.9423

Rmse:              3.0612

F-stat (1, 833): 13614.3787, p-value:     0.0000

Degrees of Freedom: model 1, resid 833

-----------------------Summary of Estimated Coefficients------------------------
      Variable       Coef    Std Err     t-stat    p-value    CI 2.5%   CI 97.5%
--------------------------------------------------------------------------------
             x     0.0326     0.0003     116.68     0.0000     0.0320     0.0331
     intercept    41.0000     0.1059     387.03     0.0000    40.7924    41.2076
---------------------------------End of Summary---------------------------------

Multiple linear regression (Video 3)

There are several ways of creating a multidimensional model, but I like this one: link.

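First, we need to make the columns whose names start with a digit available under R-style names (2PA → X2PA, 3PA → X3PA), as R does on import; otherwise the formula interface will throw an error, because names like 2PA are not valid identifiers in a formula.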


In [19]:
# Copy the digit-prefixed attempt columns to names that can be used in a formula.
NBA['X2PA'], NBA['X3PA'] = NBA['2PA'], NBA['3PA']

In [20]:
import statsmodels.formula.api as sm
# Multiple regression of points scored on all nine candidate predictors.
model = sm.ols(
    data=NBA,
    formula="PTS ~ X2PA + X3PA + FTA + AST + ORB + DRB + TOV + STL + BLK",
).fit()

In [21]:
# Regression report for the nine-predictor model: fit statistics plus one row per
# coefficient (estimate, standard error, t statistic, p-value, confidence interval).
model.summary()


Out[21]:
OLS Regression Results
Dep. Variable: PTS R-squared: 0.899
Model: OLS Adj. R-squared: 0.898
Method: Least Squares F-statistic: 817.3
Date: Thu, 19 Mar 2015 Prob (F-statistic): 0.00
Time: 21:18:08 Log-Likelihood: -5541.1
No. Observations: 835 AIC: 1.110e+04
Df Residuals: 825 BIC: 1.115e+04
Df Model: 9
coef std err t P>|t| [95.0% Conf. Int.]
Intercept -2050.8108 203.487 -10.078 0.000 -2450.223 -1651.398
X2PA 1.0429 0.030 35.274 0.000 0.985 1.101
X3PA 1.2586 0.038 32.747 0.000 1.183 1.334
FTA 1.1280 0.034 33.440 0.000 1.062 1.194
AST 0.8858 0.044 20.150 0.000 0.799 0.972
ORB -0.9554 0.078 -12.261 0.000 -1.108 -0.802
DRB 0.0388 0.062 0.631 0.528 -0.082 0.160
TOV -0.0248 0.061 -0.405 0.686 -0.145 0.095
STL -0.1992 0.092 -2.169 0.030 -0.379 -0.019
BLK -0.0558 0.088 -0.635 0.526 -0.228 0.117
Omnibus: 1.265 Durbin-Watson: 1.820
Prob(Omnibus): 0.531 Jarque-Bera (JB): 1.301
Skew: -0.094 Prob(JB): 0.522
Kurtosis: 2.951 Cond. No. 2.34e+05

Let's print the residuals:


In [22]:
# Residuals of the fitted model, one value per row of NBA (835 in the recorded output).
model.resid


Out[22]:
0      38.572271
1     142.872004
2     -92.895718
3      -8.391347
4    -258.470561
5     171.460833
6     150.408162
7     169.381143
8      40.775620
9     -75.325661
10    444.908874
11     94.386470
12   -205.680905
13    113.596904
14     64.199400
...
820   -135.417211
821    108.267709
822   -171.341020
823    102.443908
824    156.082920
825    210.052169
826    109.490894
827    -20.535417
828     59.284572
829    175.923527
830     30.653182
831    262.672801
832     70.067186
833    -17.578942
834     -8.339305
Length: 835, dtype: float64

In [23]:
# Sum of squared errors of the nine-predictor model on the training data.
# Series.sum() adds the values in vectorized code, whereas Python's builtin sum()
# iterates over the Series one element at a time; the result can differ from the
# builtin's only in the last floating-point digits.
SSE = (model.resid ** 2).sum()
SSE


Out[23]:
28394313.994756646

In [24]:
# Root mean squared error: square root of SSE divided by the number of training rows.
RMSE = np.sqrt(SSE / NBA.shape[0])
RMSE


Out[24]:
184.40489814749066

In [25]:
# Average points per team-season, for putting the RMSE above into perspective.
NBA['PTS'].mean()


Out[25]:
8370.2395209580845

In [26]:
# Same specification as the nine-predictor model, but without TOV.
model1 = sm.ols(
    data=NBA,
    formula="PTS ~ X2PA + X3PA + FTA + AST + ORB + DRB + STL + BLK",
).fit()
model1.summary()


Out[26]:
OLS Regression Results
Dep. Variable: PTS R-squared: 0.899
Model: OLS Adj. R-squared: 0.898
Method: Least Squares F-statistic: 920.4
Date: Thu, 19 Mar 2015 Prob (F-statistic): 0.00
Time: 21:18:08 Log-Likelihood: -5541.2
No. Observations: 835 AIC: 1.110e+04
Df Residuals: 826 BIC: 1.114e+04
Df Model: 8
coef std err t P>|t| [95.0% Conf. Int.]
Intercept -2076.6782 193.084 -10.755 0.000 -2455.672 -1697.684
X2PA 1.0435 0.030 35.366 0.000 0.986 1.101
X3PA 1.2627 0.037 34.099 0.000 1.190 1.335
FTA 1.1254 0.033 34.023 0.000 1.060 1.190
AST 0.8861 0.044 20.173 0.000 0.800 0.972
ORB -0.9582 0.078 -12.350 0.000 -1.110 -0.806
DRB 0.0389 0.062 0.632 0.527 -0.082 0.160
STL -0.2068 0.090 -2.301 0.022 -0.383 -0.030
BLK -0.0586 0.087 -0.670 0.503 -0.230 0.113
Omnibus: 1.258 Durbin-Watson: 1.824
Prob(Omnibus): 0.533 Jarque-Bera (JB): 1.296
Skew: -0.093 Prob(JB): 0.523
Kurtosis: 2.950 Cond. No. 2.19e+05

In [27]:
# Next reduction step: the formula no longer contains TOV or DRB.
model2 = sm.ols(
    data=NBA,
    formula="PTS ~ X2PA + X3PA + FTA + AST + ORB + STL + BLK",
).fit()
model2.summary()


Out[27]:
OLS Regression Results
Dep. Variable: PTS R-squared: 0.899
Model: OLS Adj. R-squared: 0.898
Method: Least Squares F-statistic: 1053.
Date: Thu, 19 Mar 2015 Prob (F-statistic): 0.00
Time: 21:18:08 Log-Likelihood: -5541.4
No. Observations: 835 AIC: 1.110e+04
Df Residuals: 827 BIC: 1.114e+04
Df Model: 7
coef std err t P>|t| [95.0% Conf. Int.]
Intercept -2015.4630 167.009 -12.068 0.000 -2343.274 -1687.652
X2PA 1.0483 0.029 36.753 0.000 0.992 1.104
X3PA 1.2708 0.035 36.568 0.000 1.203 1.339
FTA 1.1285 0.033 34.506 0.000 1.064 1.193
AST 0.8909 0.043 20.597 0.000 0.806 0.976
ORB -0.9702 0.075 -12.903 0.000 -1.118 -0.823
STL -0.2276 0.084 -2.724 0.007 -0.392 -0.064
BLK -0.0388 0.082 -0.475 0.635 -0.199 0.121
Omnibus: 1.168 Durbin-Watson: 1.834
Prob(Omnibus): 0.558 Jarque-Bera (JB): 1.208
Skew: -0.090 Prob(JB): 0.546
Kurtosis: 2.948 Cond. No. 1.79e+05

In [28]:
# Final reduction step: six predictors remain (TOV, DRB and BLK are left out).
model3 = sm.ols(
    data=NBA,
    formula="PTS ~ X2PA + X3PA + FTA + AST + ORB + STL",
).fit()
model3.summary()


Out[28]:
OLS Regression Results
Dep. Variable: PTS R-squared: 0.899
Model: OLS Adj. R-squared: 0.898
Method: Least Squares F-statistic: 1229.
Date: Thu, 19 Mar 2015 Prob (F-statistic): 0.00
Time: 21:18:08 Log-Likelihood: -5541.5
No. Observations: 835 AIC: 1.110e+04
Df Residuals: 828 BIC: 1.113e+04
Df Model: 6
coef std err t P>|t| [95.0% Conf. Int.]
Intercept -2032.7164 162.942 -12.475 0.000 -2352.544 -1712.889
X2PA 1.0500 0.028 37.117 0.000 0.994 1.105
X3PA 1.2731 0.034 37.001 0.000 1.206 1.341
FTA 1.1273 0.033 34.581 0.000 1.063 1.191
AST 0.8884 0.043 20.701 0.000 0.804 0.973
ORB -0.9743 0.075 -13.051 0.000 -1.121 -0.828
STL -0.2268 0.084 -2.717 0.007 -0.391 -0.063
Omnibus: 1.174 Durbin-Watson: 1.834
Prob(Omnibus): 0.556 Jarque-Bera (JB): 1.222
Skew: -0.089 Prob(JB): 0.543
Kurtosis: 2.942 Cond. No. 1.74e+05

In [29]:
# Sum of squared errors of the six-predictor model (model3) on the training data.
# Series.sum() adds the values in vectorized code, whereas Python's builtin sum()
# iterates over the Series one element at a time; the result can differ from the
# builtin's only in the last floating-point digits.
SSE3 = (model3.resid ** 2).sum()
SSE3


Out[29]:
28421464.862623505

In [30]:
# Root mean squared error of model3: square root of SSE3 over the number of training rows.
RMSE3 = np.sqrt(SSE3 / NBA.shape[0])
RMSE3


Out[30]:
184.49304179347197

Video 4: Making predictions


In [31]:
# Load the test set and add the same formula-friendly column names as in the training frame.
NBA_test = pd.read_csv(filepath_or_buffer="NBA_test.csv")
NBA_test['X2PA'], NBA_test['X3PA'] = NBA_test['2PA'], NBA_test['3PA']

In [32]:
# Predicted points for the test rows, using the six-predictor model fitted on the training data.
prediction = model3.predict(exog=NBA_test)

In [33]:
# Sum of squared prediction errors on the test set. The vectorized .sum() replaces
# Python's builtin sum(), which iterates element by element; the result can differ
# from the builtin's only in the last floating-point digits.
SSE_pred = ((prediction - NBA_test['PTS']) ** 2).sum()

In [34]:
# Total sum of squares on the test set, measured against the mean of the TRAINING
# points (the baseline prediction). The vectorized .mean()/.sum() replace np.mean
# and Python's builtin sum(), which iterates element by element; the result can
# differ from the builtin's only in the last floating-point digits.
SST_pred = ((NBA['PTS'].mean() - NBA_test['PTS']) ** 2).sum()

In [35]:
# Out-of-sample R^2: one minus the ratio of prediction error to baseline error.
R2 = 1 - (SSE_pred / SST_pred)
R2


Out[35]:
0.81271418527713002

In [36]:
# Root mean squared prediction error: square root of SSE_pred over the number of test rows.
RMSE_pred = np.sqrt(SSE_pred / NBA_test.shape[0])
RMSE_pred


Out[36]:
196.3723439642647

In [ ]: